# coding: utf-8
# File name: douban_spider.py
# Created: 2021/6/2 18:35

import requests
from lxml import etree

"""
Goal: scrape the movies currently showing in Shenzhen from Douban.

"""
# Page to scrape: Douban's "now playing" listing for Shenzhen.
url = "https://movie.douban.com/cinema/nowplaying/shenzhen/"

# HTTP headers sent with the request: a desktop Chrome/Edge User-Agent string
# and a Referer pointing at the Douban movie home page.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36 Edg/90.0.818.66"
    ),
    "Referer": "https://movie.douban.com/",
}


# 1. Fetch the page.
# requests applies no timeout by default, so without one a stalled connection
# would hang the script indefinitely.
html_file = requests.get(url, headers=headers, timeout=10)
# Fail loudly on an HTTP error status (4xx/5xx) instead of handing an error
# page to the parser below.
html_file.raise_for_status()
text = html_file.text

# 2. Parse the page.
html = etree.HTML(text)

# Take the first <ul class="lists"> in the document; its direct <li> children
# are treated as one movie entry each.
# NOTE(review): presumably the first such list is the "now playing" one --
# confirm against the live page markup.
list_nodes = html.xpath("//ul[@class='lists']")
if not list_nodes:
    # Still an IndexError (what a bare [0] lookup would raise), but with a
    # message that says what is actually wrong.
    raise IndexError(
        "no <ul class='lists'> element found in the response; "
        "the page layout may have changed or the request was blocked"
    )
ul = list_nodes[0]
lis = ul.xpath('./li')

# Collected movies: one dict per <li> with 'title', 'score' and 'img' keys.
dy_ls = []

for li in lis:
    # xpath('@attr') returns a list of matches, so read the attributes with
    # Element.get(), which yields the plain string value (None when absent).
    # This keeps 'title' and 'score' scalar, like 'img'.
    poster_srcs = li.xpath('.//img/@src')
    dy_ls.append({
        'title': li.get('data-title'),
        'score': li.get('data-score'),
        # An entry without an <img> gets None instead of aborting the run.
        'img': poster_srcs[0] if poster_srcs else None,
    })

print(dy_ls)
